In [ ]:
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline
plt.rcParams["figure.dpi"] = 200

In [ ]:
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split

# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 -- confirm the installed version still ships it before running this cell.
boston = load_boston()

# Feature matrix and regression target taken from the loaded dataset.
X = boston.data
y = boston.target

# Split into train and test parts; the fixed random_state keeps the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [ ]:
# Print the description text attached to the loaded dataset object.
print(boston.DESCR)

In [ ]:
# Scatter every feature column of X against the target y, one panel each.
# The number of panels to fill comes from the data instead of a hard-coded
# feature count.
n_features = X.shape[1]
fig, axes = plt.subplots(3, 5, figsize=(20, 10))
for i, ax in enumerate(axes.ravel()):
    if i >= n_features:
        # The 3x5 grid has more panels than there are features; hide the rest.
        ax.set_visible(False)
        continue
    ax.plot(X[:, i], y, 'o', alpha=.5)
    ax.set_title("{}: {}".format(i, boston.feature_names[i]))
    ax.set_ylabel("MEDV")

In [ ]:
# One box per feature column of X, to compare the scales of the features.
plt.boxplot(X)
# Boxes are drawn at positions 1..n_features; label them with feature names.
plt.xticks(np.arange(1, X.shape[1] + 1),
           boston.feature_names, rotation=30, ha="right")
# The y-axis shows the raw feature values, not the MEDV target.
plt.ylabel("Feature value")

In [ ]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply it to them.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)

In [ ]:
from sklearn.model_selection import cross_val_score

In [ ]:
from sklearn.neighbors import KNeighborsRegressor

# Cross-validate a default kNN regressor on the unscaled training features
# (cv=10) and show the mean and standard deviation of the fold scores.
scores = cross_val_score(
    estimator=KNeighborsRegressor(),
    X=X_train,
    y=y_train,
    cv=10,
)
scores.mean(), scores.std()

In [ ]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Cross-validate kNN on standardized features. The scaler lives inside the
# pipeline so it is refit on the training part of every fold; scaling the
# whole training set up front would let each held-out fold influence the
# mean/variance used to preprocess it, which biases the scores.
scaled_knn = make_pipeline(StandardScaler(), KNeighborsRegressor())
scores = cross_val_score(scaled_knn, X_train, y_train, cv=10)
np.mean(scores), np.std(scores)

Categorical Variables


In [ ]:
import pandas as pd

# Toy frame with one numeric column and one string-valued categorical column.
# The borough name is spelled "Manhattan"; the spelling matters because the
# category values become column names once the frame is dummy-encoded.
df = pd.DataFrame({'salary': [103, 89, 142, 54, 63, 219],
                   'boro': ['Manhattan', 'Queens', 'Manhattan', 'Brooklyn', 'Brooklyn', 'Bronx']})
df

In [ ]:
# Dummy-encode the frame using the default column selection of get_dummies.
pd.get_dummies(data=df)

In [ ]:
# Same toy frame, but with the category stored as integer codes.
df = pd.DataFrame(dict(
    salary=[103, 89, 142, 54, 63, 219],
    boro=[0, 1, 0, 2, 2, 3],
))
df

In [ ]:
# Name the column to encode explicitly via `columns`.
pd.get_dummies(data=df, columns=["boro"])

Exercise

Apply dummy encoding to the categorical columns and scaling to the numeric columns of the "adult" dataset, which consists of income data from the census.

Bonus: visualize the data.


In [ ]:
# Read the exercise data from the working directory, using the first CSV
# column as the row index.
data = pd.read_csv(
    "adult.csv",
    index_col=0,
)

In [ ]:
# %load solutions/load_adult.py